/**********************************************************************
gc_4_akm.do

**********************************************************************/
**********
* SET UP *
**********
* start from an empty session *
clear all
set matsize 2000
* NOTE(review): set more 1 looks like the legacy spelling of set more off; confirm *
set more 1

* work from the do-file folder and remember that folder in the global dir *
cd "T:\_Projet_4915\dofiles"
global dir "`c(pwd)'"
cd "$dir"

* point the PERSONAL ado directory at the project ado folder *
sysdir set PERSONAL "T:\_Projet_4915\ado"

* remove any installed copies of ftools and reghdfe (errors are ignored), *
* then reinstall both from the local source folders, ftools first *
foreach pkg in ftools reghdfe {
	capture ado uninstall `pkg'
}
net install ftools, from("T:\_Projet_4915\ado\ftools-master\src")
net install reghdfe, from("T:\_Projet_4915\ado\reghdfe-master\src")

*********
* GATES *
*********
* data source to process: syn = synthetic, rl = real *
local ext "rl"

* gender to process: men or women *
local gender "men"

* step switches: a step is executed only when its gate equals 1 *
* STEP 1: prepare dataset for analysis *
local gate1 1
* STEP 2: run akm analysis *
local gate2 1
* STEP 3: delete all intermediate datasets *
local gate3 1

* start log file: close any log left open, then open a text log named after the data source *
quietly capture log close
quietly log using gc_4_akm_`ext', text replace

* specify file locations (folders are given relative to the T: drive) *
global project_folder "\_Projet_4915"
global data_folder "\_Projet_4915\DATA"
global output_folder "\_Projet_4915\ResultsFolder"
global temp "temp"

* Data directory prefix used to build every dataset path below. *
* A backslash placed directly in front of a macro reference tells Stata not *
* to expand it, so the drive letter is joined to the global without one *
* (the global already begins with a backslash) and the trailing separator *
* is a forward slash, which Stata accepts as a path separator on Windows. *
local datadir "T:${data_folder}/"

*******************************************
* MAKE A GLOBAL VARIABLE FOR TODAY'S DATE *
*******************************************
* S_DATE holds the current date as text in day, month name, year order. *
* Convert it to a Stata daily date and format it as YYYYMMDD, with the *
* month and the day zero-padded to two digits. The result is stored as *
* text in the global macro date. *
global date = string(date("$S_DATE", "DMY"), "%tdCCYYNNDD")

****************************************
* DEFINE LITTLE PROGRAMS TO PRINT TIME *
****************************************
* starttime: print one line with the current clock time and date, marking the start of a step *
program starttime
	display "Started processing at $S_TIME on $S_DATE"
end

* endtime: print one line with the current clock time and date, marking the end of a step *
program endtime
	display "Finished processing at $S_TIME on $S_DATE"
end

************************
* START OF THE PROGRAM *
************************

****************************************
* STEP 1: prepare dataset for analysis *
****************************************
disp "***** Started processing STEP 1 *****"
starttime
if 1 == `gate1' {
	disp "***** STEP 1: prepare dataset for analysis *****"

	* load the individual panel for the chosen gender and data source; *
	* the datadir local is the one defined with the file locations above *
	use "`datadir'gc_individual_panel_`gender'_`ext'.dta", clear
	sort pid tax_yr

	* restrict to the requested sex code: 1 for men, 2 for women *
	if "`gender'" == "men" {
		keep if sex == 1
	}
	else if "`gender'" == "women" {
		keep if sex == 2
	}

	************************************************************************
	* Construct various occupational categories based on sources of income *
	************************************************************************
	* The first income source that is at least y_0 decides the category, *
	* checked in this order: incorporated (3), unincorporated (2), *
	* paid employee (1). Rows matching none stay nonemployed (0). *
	* NOTE(review): Stata ranks missing above every number, so a missing *
	* y_3_all, y_2 or y_1 passes its test; confirm incomes are never missing *
	generate occ_v1 = cond(y_3_all >= y_0, 3, cond(y_2 >= y_0, 2, cond(y_1 >= y_0, 1, 0)))
	label variable occ_v1 "Occupation, v1"

	* income from the main occupation; missing for the nonemployed *
	generate y_v1 = cond(occ_v1 == 1, y_1, cond(occ_v1 == 2, y_2, cond(occ_v1 == 3, y_3_all, .)))
	label variable y_v1 "Annual income (main occupation), v1"

	* keep only paid employees, and only rows with a firm identifier *
	keep if occ_v1 == 1
	drop if eid_long == ""

	* report the number of distinct individuals left: flag the first row *
	* of each pid (the data are still sorted by pid) and count the flags *
	by pid: generate byte first_row = (_n == 1)
	count if first_row
	display "There are now `r(N)' individuals in the `gender' dataset"
	drop first_row

	* keep the key variables and put them in a fixed column order *
	local keyvars pid eid_long tax_yr dob_yr age y_v1
	keep `keyvars'
	order `keyvars'

	* save intermediate dataset to the data_folder *
	compress
	save "`datadir'gc_akm_panel_v1_`gender'_`ext'.dta", replace
	clear

	disp "***** STEP 1: prepare dataset for analysis (COMPLETED) *****"
}
disp "***** Finished processing STEP 1 *****"
endtime

****************************
* STEP 2: run akm analysis *
****************************
disp "***** Started processing STEP 2 *****"
starttime
if 1 == `gate2' {
	disp "***** STEP 2: run akm analysis *****"

	* load the worker panel written by STEP 1 *
	use "`datadir'gc_akm_panel_v1_`gender'_`ext'.dta", clear

	* outcome: log of annual income from the main occupation *
	generate log_y = log(y_v1)

	* age terms centred at 40 and divided by 40; only the square and the *
	* cube enter the regression below, the linear term does not *
	generate age40 = (age-40)/40
	generate age40_sq = age40*age40
	generate age40_cu = age40_sq*age40

	* numeric copies of the person and firm identifiers for absorb() *
	* NOTE(review): destring creates no new variable when the source is *
	* already numeric or holds non-numeric characters; confirm that pid *
	* and eid_long are all-digit strings short enough to convert exactly *
	destring pid, gen(pid_int)
	destring eid_long, gen(eid_int)

	* two-way fixed effects regression: person and firm effects are saved *
	* as person_fe and firm_fe, the mobility group as group_id *
	reghdfe log_y i.tax_yr age40_sq age40_cu, absorb(person_fe=pid_int firm_fe=eid_int) timeit verbose(1) groupv(group_id) version(5)

	* save the panel together with the estimated effects *
	compress
	save "`datadir'gc_akm_panel_v1_`gender'_post_`ext'.dta", replace

	* save firm fixed effects: the panel just saved is still in memory, so *
	* continue from it. Keep mobility group 1 and rows where both effects *
	* are present, then average the effects within each firm *
	* NOTE(review): this assumes group 1 is the connected set of interest; *
	* confirm how reghdfe numbers the mobility groups *
	keep if group_id == 1 & !missing(person_fe, firm_fe)
	collapse (mean) firm_fe person_fe, by(eid_long)
	compress
	save "`datadir'gc_akm_v1_`gender'_firm_fe_`ext'.dta", replace
	clear

	disp "***** STEP 2: run akm analysis (COMPLETED) *****"
}
disp "***** Finished processing STEP 2 *****"
endtime

********************************************
* STEP 3: delete all intermediate datasets *
********************************************
disp "***** Started processing STEP 3 *****"
starttime
if 1 == `gate3' ///
{
	disp "***** STEP 3: delete all intermediate datasets *****"
	
	* remove the panel written by STEP 1. The erase stays best effort, so *
	* a failure does not stop the run, but it is reported in the log *
	* together with the return code instead of being silently ignored *
	capture erase "`datadir'gc_akm_panel_v1_`gender'_`ext'.dta"
	local erase_rc = _rc
	if `erase_rc' != 0 {
		disp "note: could not erase `datadir'gc_akm_panel_v1_`gender'_`ext'.dta (return code `erase_rc')"
	}
			
	disp "***** STEP 3: delete all intermediate datasets (COMPLETED) *****"
}
disp "***** Finished processing STEP 3 *****"
endtime

********
* EXIT *
********
* empty the session memory, then close the log file opened near the top of this do-file *
clear all
log close
